Libs e utils
library(tidyverse)
library(factoextra)
library(lubridate)
library(caret)
library(leaflet)
library(viridisLite)
library(gridExtra)
library(ggpubr)
library(corrplot)
library(ROCR)
getmode <- function(v) {
uniqv <- unique(v[which(!is.na(v))])
uniqv[which.max(tabulate(match(v, uniqv)))]
}
elbow <- function(data,k.max){
wss <- sapply(1:k.max,
function(k){kmeans(data, k, nstart=50,iter.max = 15 )$tot.withinss})
print(plot(1:k.max, wss,
type="b", pch = 19, frame = FALSE,
xlab="Number of clusters K",
ylab="Total within-clusters sum of squares"))
}
rocp = function(mod,test,y_test){
pred = predict(mod,test,"prob")[,2]
p = prediction(pred,y_test)
perf = performance(p,measure = "tpr", x.measure = "fpr")
print(plot(perf,main = "ROC Curve"))
auc.perf = performance(p, measure = "auc")
return (auc.perf@y.values)
}
plot_density_pred = function(mod,test){
pred = predict(mod,test,"prob")
pred = cbind(pred,raintomorrow=test$raintomorrow)
names(pred)[1:2] = c("No","Yes")
print(ggplot(pred,aes(Yes,col=raintomorrow)) + geom_density() + theme_bw() + ggtitle("Distribuição das probabilidades preditas de Yes"))
}
Leitura dos dados
setwd("./processo_cientista-master/data")
rain_data <- read_csv("rain_data_aus.csv",
col_types = cols(evaporation = col_double(),
sunshine = col_double()))
for(i in 1:8){
file = paste0("wind_table_0",i,".csv")
if(i==1){
wind_data = read_csv(file)
cols = names(wind_data)
}
else{
aux = read_csv(file)
names(aux) = cols
wind_data = rbind(wind_data,aux)
}
}
coords = read_csv("city_coord.csv")
O problema consiste em prever se irá ou não chover no dia seguinte. Os dados estão divididos em duas bases diferentes uma delas lida com informações metereologicas mais gerais e a outra com informações específicas sobre os ventos, inicialmente elas serão tratadas individualmente, mas eventualmente serão unidas.
Para a base de dados sobre a chuva, ela possui 142193 observações e 23 variáveis que são:
## [1] "date" "location" "mintemp" "maxtemp" "rainfall" "evaporation"
## [7] "sunshine" "humidity9am" "humidity3pm" "pressure9am" "pressure3pm" "cloud9am"
## [13] "cloud3pm" "temp9am" "temp3pm" "raintoday" "amountOfRain" "raintomorrow"
## [19] "temp" "humidity" "precipitation3pm" "precipitation9am" "modelo_vigente"
A variável resposta é raintomorrow.
Inicialmente convertendo as colunas do tipo character para factor e validando os demais formatos, também removendo a coluna com as probabilidades do modelo vigente.
rain_data = rain_data %>% mutate_if(is.character, as.factor)
rain_data = rain_data %>% select(-modelo_vigente)
summary(rain_data)
## date location mintemp maxtemp rainfall evaporation sunshine
## Min. :2007-11-01 Canberra: 3418 Min. :-8.50 Min. :-4.80 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.:2011-01-06 Sydney : 3337 1st Qu.: 7.60 1st Qu.:17.90 1st Qu.: 0.00 1st Qu.: 2.60 1st Qu.: 4.90
## Median :2013-05-27 Perth : 3193 Median :12.00 Median :22.60 Median : 0.00 Median : 4.80 Median : 8.50
## Mean :2013-04-01 Darwin : 3192 Mean :12.19 Mean :23.23 Mean : 2.35 Mean : 5.47 Mean : 7.62
## 3rd Qu.:2015-06-12 Hobart : 3188 3rd Qu.:16.80 3rd Qu.:28.20 3rd Qu.: 0.80 3rd Qu.: 7.40 3rd Qu.:10.60
## Max. :2017-06-25 Brisbane: 3161 Max. :33.90 Max. :48.10 Max. :371.00 Max. :145.00 Max. :14.50
## (Other) :122704 NA's :637 NA's :322 NA's :1406 NA's :60843 NA's :67816
## humidity9am humidity3pm pressure9am pressure3pm cloud9am cloud3pm temp9am
## Min. : 0.00 Min. : 0.00 Min. : 980.5 Min. : 977.1 Min. :0.00 Min. :0.0 Min. :-7.20
## 1st Qu.: 57.00 1st Qu.: 37.00 1st Qu.:1012.9 1st Qu.:1010.4 1st Qu.:1.00 1st Qu.:2.0 1st Qu.:12.30
## Median : 70.00 Median : 52.00 Median :1017.6 Median :1015.2 Median :5.00 Median :5.0 Median :16.70
## Mean : 68.84 Mean : 51.48 Mean :1017.7 Mean :1015.3 Mean :4.44 Mean :4.5 Mean :16.99
## 3rd Qu.: 83.00 3rd Qu.: 66.00 3rd Qu.:1022.4 3rd Qu.:1020.0 3rd Qu.:7.00 3rd Qu.:7.0 3rd Qu.:21.60
## Max. :100.00 Max. :100.00 Max. :1041.0 Max. :1039.6 Max. :9.00 Max. :9.0 Max. :40.20
## NA's :1774 NA's :3610 NA's :14014 NA's :13981 NA's :53657 NA's :57094 NA's :904
## temp3pm raintoday amountOfRain raintomorrow temp humidity precipitation3pm precipitation9am
## Min. :-5.40 No :109332 Min. : 0.000 No :110316 Min. :-3.76 Min. : 2.00 Min. : 0.00 Min. :-17.74
## 1st Qu.:16.60 Yes : 31455 1st Qu.: 0.000 Yes: 31877 1st Qu.:22.52 1st Qu.: 44.00 1st Qu.: 8.00 1st Qu.: 6.65
## Median :21.10 NA's: 1406 Median : 0.000 Median :28.52 Median : 63.20 Median :10.00 Median : 10.00
## Mean :21.69 Mean : 2.361 Mean :28.51 Mean : 61.99 Mean :10.01 Mean : 10.00
## 3rd Qu.:26.40 3rd Qu.: 0.800 3rd Qu.:35.48 3rd Qu.: 80.00 3rd Qu.:12.00 3rd Qu.: 13.39
## Max. :46.70 Max. :371.000 Max. :59.72 Max. :122.00 Max. :26.00 Max. : 32.48
## NA's :2726 NA's :322 NA's :3610
Pode-se notar um grande número de valores faltantes.
Aproximadamente 9.08% dos valores da tabela são faltantes, isso será discutido mais a frente.
Já a outra base de dados, que lida com informações específicas sobre o vento numa localidade específica e uma data específica, possui 164386 observações e 8 variáveis que são:
## [1] "date" "location" "wind_gustdir" "wind_gustspeed" "wind_dir9am" "wind_dir3pm" "wind_speed9am"
## [8] "wind_speed3pm"
Corrigindo os campos character para factor e validando os demais tipos
wind_data = wind_data %>% mutate_if(is.character, as.factor)
summary(wind_data)
## date location wind_gustdir wind_gustspeed wind_dir9am wind_dir3pm wind_speed9am
## Min. :2007-11-01 Canberra: 3877 W : 11451 Min. : 6.00 N :13294 SE : 12019 Min. : 0.00
## 1st Qu.:2011-06-07 Sydney : 3798 SE : 10654 1st Qu.: 31.00 SE :10503 W : 11523 1st Qu.: 7.00
## Median :2014-01-13 Perth : 3655 N : 10513 Median : 39.00 E :10397 S : 10971 Median : 13.00
## Mean :2013-09-25 Darwin : 3653 E : 10446 Mean : 39.86 SSE :10289 WSW : 10729 Mean : 13.96
## 3rd Qu.:2016-04-24 Hobart : 3650 WSW : 10320 3rd Qu.: 48.00 NW :10039 SW : 10630 3rd Qu.: 19.00
## Max. :2017-06-25 Brisbane: 3618 (Other):100424 Max. :135.00 (Other):98436 (Other):103679 Max. :130.00
## (Other) :142135 NA's : 10578 NA's :10518 NA's :11428 NA's : 4835 NA's :1453
## wind_speed3pm
## Min. : 0.00
## 1st Qu.:13.00
## Median :19.00
## Mean :18.57
## 3rd Qu.:24.00
## Max. :87.00
## NA's :3583
Já para esses dados os valores faltantes correspondem à 3.22% das observações, valor menor que da base de chuvas.
Dentro dos dados de chuva existem 18 colunas numéricas, 3 categóricas e uma de data, que serão avaliadas a seguir.
Essa é a variável resposta do problema, como é possível notar ela está distribuida de maneira bastante desbalanceada entre os seus possíveis resultados, o que demanda algumas mudanças na hora do desenvolvimento do modelo.
ggplot(rain_data,aes(raintomorrow)) + geom_bar() + theme_bw() + ylab("Contagem")
Essa varíavel discrimina se no dia de hoje teve-se algum relato de chuva ou não
p1 = ggplot(rain_data,aes(raintoday)) + geom_bar()+ theme_bw() + ylab("Contagem")
p2=ggplot(rain_data,aes(raintoday, fill = raintomorrow)) + geom_bar(position = "dodge") + theme_bw() + ylab("Contagem")
grid.arrange(p1,p2,ncol=1)
É possível notar que nos dias que choveu a proporção de vezes que choveu no dia seguinte é muito maior.
prop.table(table(Choveu.Hoje = rain_data$raintoday, Chove.Amanha = rain_data$raintomorrow),margin = 1)
## Chove.Amanha
## Choveu.Hoje No Yes
## No 0.8481323 0.1518677
## Yes 0.5359402 0.4640598
Localidade da medição, quase todas elas possuem a mesma frequência nos dados com exceção de Hobart, Nhil e Tuggeranong que distoam mais.
p1 = ggplot(rain_data,aes(location)) + geom_bar() + theme_bw() + ylab("Contagem") + theme(axis.text.x = element_text(angle = 90))
p2 = rain_data %>% group_by(location,raintomorrow) %>% summarise(total = n()) %>% mutate(Proporcao=total/sum(total)) %>%
ggplot(aes(x=location,y=Proporcao, fill = raintomorrow)) + geom_bar(position = "dodge",stat="identity") + theme_bw() + ylab("Proporção") + theme(axis.text.x = element_text(angle = 90))
grid.arrange(p1,p2)
É notavel também que a proporção vezes que choveu no dia seguinte varia bastante entre as localidades, discriminando bem a variável resposta.
O Volume médio de chuva possui uma variação considerável levando em conta as localidades.
rain_data %>% filter(raintoday == "Yes") %>% group_by(location) %>% summarise(Media_Volume_Chuva = mean(amountOfRain)) %>%
ggplot(aes(location,Media_Volume_Chuva)) + geom_bar(stat="Identity") + theme_bw() + ggtitle("Média do volume de chuvas por localidade") + theme(axis.text.x = element_text(angle = 90))
Variável que mede a quantidade de chuva, entretanto sem informação se é sobre o dia atual ou o próximo.
ggplot(rain_data,aes(amountOfRain,col=raintomorrow)) + geom_density() + theme_bw()
aux = rain_data %>% select(amountOfRain,raintomorrow) %>% mutate(Zero_chuva = ifelse(amountOfRain==0,1,0),
raintomorrow = ifelse(raintomorrow=='Yes',1,0))
table(Zero_Chuva = aux$Zero_chuva, Chove_Amanha = aux$raintomorrow)
## Chove_Amanha
## Zero_Chuva 0 1
## 0 19239 31877
## 1 91077 0
Ou seja para os dias com amountOfRain = 0, 100% das vezes não choverá amanhã. Variável nao será utilizada para a modelagem pois muito provavelmente se refere à algum evento futuro.
Variáveis com muitos outliers: rainfall, evaporation, amountofrain.
As outras variáveis mostram um comportamento mais controlado, tendo a mediana como centro dos dados.
rain_data = rain_data %>% mutate(Dia = as.factor(day(date)),
Mes = as.factor(month(date)),
Ano = as.factor(year(date)))
plots = list()
plots_resp = list()
i=1
for (col in names_numerical_columns) {
p = ggplot(rain_data,aes_string(x="1",y=col))+geom_boxplot() +theme_bw() + ggtitle(col)
p2 = ggplot(rain_data,aes_string(col,col="raintomorrow")) + geom_density() + theme_bw() + ggtitle(col)
plots[[i]] = p
plots_resp[[i]] = p2
i = i+1
}
x = length(plots)
cols = round(sqrt(x),0)
rows = ceiling(x/cols)
ggarrange(plotlist = plots, ncol=cols, nrow = rows)
Comportamento das variáveis numéricas realizando a separação pela variável resposta
ggarrange(plotlist = plots_resp, ncol=cols, nrow = rows)
Individualmente nenhuma das variáveis possuiria um bom desempenho para dizer se chove ou não amanhã, entretanto algumas variáveis acabam chamando atenção como sunshine, nos dias onde essa variável mostra valores mais altos existe uma tendência à não chover no dia seguinte ou humidity3pm onde valores mais baixos mostram também a mesma tendência, entre outras.
corr = cor(rain_data[,names_numerical_columns], use = "complete.obs")
corrplot(corr,'number')
Pode-se notar algumas correlações altas esperadas como temperatura minima e a temperatura da as 9 da manhã e a quantidade de horas de sol com a quantidade de nuvens no céu, outros pontos são:
umidade e a umidade medida as 3 da tarde mostram uma forte correlação também, indicando que a médida de umidade normalmente foi feita na parte da tarde, o mesmo para a temperatura.
umidade 9 da manhã e das 3 tarde possuem média correlação indicando que essa medida não se altera muito ao longo do dia.
umidade e a quantidade de horas de sol possuem uma correlação média.
a variável que mede evaporação com a temperatura possuem uma correlação média
rain_data = rain_data %>% mutate(Ano = as.factor(year(date)),
Mes = as.factor(month(date)),
Dia = as.factor(day(date)),
Week = as.factor(week(date)))
p1 = rain_data %>% group_by(Ano) %>% summarise(Qtd_Chuva = sum(rainfall,na.rm = T)) %>%
ggplot(aes(x=Ano,y=Qtd_Chuva,group=1)) + geom_line() + theme_bw() + theme(axis.text.x = element_text(angle = 90))
p2 = rain_data %>% group_by(Ano,Mes) %>% arrange(Ano,Mes) %>% summarise(Qtd_Chuva = sum(rainfall,na.rm = T)) %>%
ggplot(aes(x=Mes,y=Qtd_Chuva,group=factor(Ano),col=factor(Ano))) + geom_line() + theme_bw() + theme(axis.text.x = element_text(angle = 90))
p3 = rain_data %>% group_by(Ano,Week) %>% arrange(Ano,Week) %>% summarise(Qtd_Chuva = sum(rainfall,na.rm = T)) %>%
ggplot(aes(x=Week,y=Qtd_Chuva,group=factor(Ano),col=factor(Ano))) + geom_line() + theme_bw() + theme(axis.text.x = element_text(angle = 90))
grid.arrange(p1,p2,p3,ncol=1)
Aparentemente não existe nenhum tipo de padrão ao longo dos anos nos dados.
rain_data %>% group_by(Ano,Mes) %>% count() %>% mutate (AnoMes = factor(paste0(Ano,'-',Mes))) %>%
ggplot(aes(x=AnoMes,y=n)) + geom_bar(stat="identity") + theme_bw() + theme(axis.text.x = element_text(angle = 90))
Com exceção do período 2007/11 - 2008/09 as quantidades das medidas são muito próximas entre os meses
Existe diferença na proporção das vezes que choveu em cada um dos meses
rain_data %>% group_by(Mes) %>% mutate(choveu = ifelse(raintoday=='Yes',1,0)) %>% summarise(prop_1 = sum(choveu, na.rm = T), freq = n()) %>% mutate (prop = prop_1/freq) %>% ggplot(aes(x=Mes,y=prop)) + geom_bar(stat="identity") + theme_bw() + theme(axis.text.x = element_text(angle = 90))
prop.table(table(rain_data$Mes,rain_data$raintomorrow),margin=1)
##
## No Yes
## 1 0.8066713 0.1933287
## 2 0.7903088 0.2096912
## 3 0.7866677 0.2133323
## 4 0.7821511 0.2178489
## 5 0.7747223 0.2252777
## 6 0.7381548 0.2618452
## 7 0.7307921 0.2692079
## 8 0.7474919 0.2525081
## 9 0.7702953 0.2297047
## 10 0.8043036 0.1956964
## 11 0.7874531 0.2125469
## 12 0.7918594 0.2081406
Teste chi quadrado para verificar se os grupos são estatisticamente diferentes, obtendo um p-valor de quase zero, confirmando a hipótese de efeito da variável mês.
chisq.test(rain_data$Mes,rain_data$raintomorrow)
##
## Pearson's Chi-squared test
##
## data: rain_data$Mes and rain_data$raintomorrow
## X-squared = 469.53, df = 11, p-value < 2.2e-16
Dentro dos dados de chuva existem 3 colunas numéricas, 4 categóricas e uma de data, que serão avaliadas a seguir.
Dentre as variáveis categóricas da base de ventos temos a que fala sobre a localidade da medição as outras três trazem informações sobre a direção que o vento vem.
A varíavel sobre a localização da medição, quase todas as localidade tiveram a praticamente mesma quantidade de medições com exceção de Hobart, Nhil e Uluru.
wind_data %>% group_by(location) %>% count() %>% ggplot(aes(location,n)) + geom_bar(stat='identity') + theme_bw() + theme(axis.text.x = element_text(angle = 90))
Existem 3 varíaveis que dizem a região que o vento está vindo:
É possivel notar que existe diferença entre as proporções de dias que tiveram ou não chuva no dia seguinte dentre as direções dos ventos, entretanto os gráficos para as 3 variáveis são bastante parecidos. Entretanto a interação da variável de direção com a sua velocidade respectiva, possa ser importante no problema.
Adicionando a variável resposta do problema para a construção de algumas visualizações
wind_data = wind_data %>% left_join(rain_data %>% select(date,location,raintomorrow))
wind_data %>% group_by(wind_gustdir,raintomorrow) %>% summarise(n = n()) %>% mutate(Proporcao = n/sum(n)) %>% ggplot(aes(x=wind_gustdir,y=Proporcao,fill=raintomorrow)) + geom_bar(position = "dodge",stat='identity') + theme_bw() + ggtitle("wind_gustdir")
wind_data %>% group_by(wind_dir9am,raintomorrow) %>% summarise(n = n()) %>% mutate(Proporcao = n/sum(n)) %>% ggplot(aes(x=wind_dir9am,y=Proporcao,fill=raintomorrow)) + geom_bar(position = "dodge",stat='identity') + theme_bw() + ggtitle("wind_dir9am")
wind_data %>% group_by(wind_dir3pm,raintomorrow) %>% summarise(n = n()) %>% mutate(Proporcao = n/sum(n)) %>% ggplot(aes(x=wind_dir3pm,y=Proporcao,fill=raintomorrow)) + geom_bar(position = "dodge",stat='identity') + theme_bw() + ggtitle("wind_dir3pm")
Essas variáveis medem a velocidade do vento:
ggplot(wind_data,aes(x=1,wind_gustspeed)) + geom_boxplot() + theme_bw() + ggtitle("wind_gustspeed")
ggplot(wind_data,aes(x=1,wind_speed9am)) + geom_boxplot() + theme_bw() + ggtitle("wind_speed9am")
ggplot(wind_data,aes(x=1,wind_speed3pm)) + geom_boxplot() + theme_bw() + ggtitle("wind_speed3pm")
Relações com a variável resposta.
ggplot(wind_data,aes(wind_gustspeed, col = raintomorrow)) + geom_density() + theme_bw() + ggtitle("wind_gustspeed")
## Warning: Removed 10518 rows containing non-finite values (stat_density).
ggplot(wind_data,aes(wind_speed9am, col = raintomorrow)) + geom_density() + theme_bw() + ggtitle("wind_speed9am")
## Warning: Removed 1453 rows containing non-finite values (stat_density).
ggplot(wind_data,aes(wind_speed3pm, col = raintomorrow)) + geom_density() + theme_bw() + ggtitle("wind_speed3pm")
## Warning: Removed 3583 rows containing non-finite values (stat_density).
wind_data = wind_data %>% select(-raintomorrow)
wind_gustspeed é a que possui maior distinção entre as observações que chovem ou não no dia seguinte, mas nada muito conclusivo. Como foi dito anteriormente essa é uma avaliação da variável individualmente sem levar em conta interação com outras variáveis.
Como visto anteriormente existem muitos valores faltantes na base de dados de chuvas.
summary(rain_data)
## date location mintemp maxtemp rainfall evaporation sunshine
## Min. :2007-11-01 Canberra: 3418 Min. :-8.50 Min. :-4.80 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.:2011-01-06 Sydney : 3337 1st Qu.: 7.60 1st Qu.:17.90 1st Qu.: 0.00 1st Qu.: 2.60 1st Qu.: 4.90
## Median :2013-05-27 Perth : 3193 Median :12.00 Median :22.60 Median : 0.00 Median : 4.80 Median : 8.50
## Mean :2013-04-01 Darwin : 3192 Mean :12.19 Mean :23.23 Mean : 2.35 Mean : 5.47 Mean : 7.62
## 3rd Qu.:2015-06-12 Hobart : 3188 3rd Qu.:16.80 3rd Qu.:28.20 3rd Qu.: 0.80 3rd Qu.: 7.40 3rd Qu.:10.60
## Max. :2017-06-25 Brisbane: 3161 Max. :33.90 Max. :48.10 Max. :371.00 Max. :145.00 Max. :14.50
## (Other) :122704 NA's :637 NA's :322 NA's :1406 NA's :60843 NA's :67816
## humidity9am humidity3pm pressure9am pressure3pm cloud9am cloud3pm temp9am
## Min. : 0.00 Min. : 0.00 Min. : 980.5 Min. : 977.1 Min. :0.00 Min. :0.0 Min. :-7.20
## 1st Qu.: 57.00 1st Qu.: 37.00 1st Qu.:1012.9 1st Qu.:1010.4 1st Qu.:1.00 1st Qu.:2.0 1st Qu.:12.30
## Median : 70.00 Median : 52.00 Median :1017.6 Median :1015.2 Median :5.00 Median :5.0 Median :16.70
## Mean : 68.84 Mean : 51.48 Mean :1017.7 Mean :1015.3 Mean :4.44 Mean :4.5 Mean :16.99
## 3rd Qu.: 83.00 3rd Qu.: 66.00 3rd Qu.:1022.4 3rd Qu.:1020.0 3rd Qu.:7.00 3rd Qu.:7.0 3rd Qu.:21.60
## Max. :100.00 Max. :100.00 Max. :1041.0 Max. :1039.6 Max. :9.00 Max. :9.0 Max. :40.20
## NA's :1774 NA's :3610 NA's :14014 NA's :13981 NA's :53657 NA's :57094 NA's :904
## temp3pm raintoday amountOfRain raintomorrow temp humidity precipitation3pm precipitation9am
## Min. :-5.40 No :109332 Min. : 0.000 No :110316 Min. :-3.76 Min. : 2.00 Min. : 0.00 Min. :-17.74
## 1st Qu.:16.60 Yes : 31455 1st Qu.: 0.000 Yes: 31877 1st Qu.:22.52 1st Qu.: 44.00 1st Qu.: 8.00 1st Qu.: 6.65
## Median :21.10 NA's: 1406 Median : 0.000 Median :28.52 Median : 63.20 Median :10.00 Median : 10.00
## Mean :21.69 Mean : 2.361 Mean :28.51 Mean : 61.99 Mean :10.01 Mean : 10.00
## 3rd Qu.:26.40 3rd Qu.: 0.800 3rd Qu.:35.48 3rd Qu.: 80.00 3rd Qu.:12.00 3rd Qu.: 13.39
## Max. :46.70 Max. :371.000 Max. :59.72 Max. :122.00 Max. :26.00 Max. : 32.48
## NA's :2726 NA's :322 NA's :3610
## Dia Mes Ano Week
## 1 : 4688 5 :13055 2016 :17508 19 : 2960
## 15 : 4688 3 :13036 2014 :17400 22 : 2957
## 16 : 4687 1 :12921 2015 :17231 20 : 2948
## 2 : 4686 6 :12389 2009 :16595 23 : 2946
## 13 : 4686 10 :11804 2010 :16419 10 : 2945
## 6 : 4684 7 :11779 2013 :16097 25 : 2944
## (Other):114074 (Other):67209 (Other):40943 (Other):124493
Por início será utilizado a média mensal ao longo dos anos para dada localidade, entretanto para as variáveis citadas como com muitos outliers será utilizado a mediana.
medias_mes_local = rain_data %>% group_by(location,Mes) %>%
summarise_at(c(vars(mintemp:maxtemp,sunshine:temp3pm,temp,humidity),vars(rainfall,evaporation,amountOfRain)),funs(mean(.,na.rm = T), median(.,na.rm = T)))
rain_data_clean_1 = rain_data %>% left_join(medias_mes_local, by = c("location","Mes")) %>%
mutate(mintemp = coalesce(mintemp, mintemp_mean),
maxtemp = coalesce(maxtemp, maxtemp_mean),
rainfall = coalesce(rainfall, rainfall_median),
evaporation = coalesce(evaporation, evaporation_median),
sunshine = coalesce(sunshine, sunshine_mean),
humidity9am = coalesce(humidity9am, humidity9am_mean),
humidity3pm = coalesce(humidity3pm, humidity3pm_mean),
pressure9am = coalesce(pressure9am, pressure9am_mean),
pressure3pm = coalesce(pressure3pm, pressure3pm_mean),
cloud9am = coalesce(cloud9am, cloud9am_mean),
cloud3pm = coalesce(cloud3pm, cloud3pm_mean),
temp9am = coalesce(temp9am, temp9am_mean),
temp3pm = coalesce(temp3pm, temp3pm_mean),
temp = coalesce(temp, temp_mean),
humidity = coalesce(humidity, humidity_mean)) %>% select(-contains("mean"),-contains("median"))
Redução de quase 100 mil NA
sum(is.na(rain_data_clean_1))
## [1] 190589
Os campos que continuam com NA, não obtiveram nenhuma medida da variável dentro daquela cidade no período todo
summary(rain_data_clean_1)
## date location mintemp maxtemp rainfall evaporation sunshine
## Min. :2007-11-01 Canberra: 3418 Min. :-8.50 Min. :-4.80 Min. : 0.000 Min. : 0.00 Min. : 0.00
## 1st Qu.:2011-01-06 Sydney : 3337 1st Qu.: 7.60 1st Qu.:17.90 1st Qu.: 0.000 1st Qu.: 2.60 1st Qu.: 5.40
## Median :2013-05-27 Perth : 3193 Median :12.00 Median :22.60 Median : 0.000 Median : 4.60 Median : 8.20
## Mean :2013-04-01 Darwin : 3192 Mean :12.19 Mean :23.23 Mean : 2.327 Mean : 5.31 Mean : 7.64
## 3rd Qu.:2015-06-12 Hobart : 3188 3rd Qu.:16.80 3rd Qu.:28.20 3rd Qu.: 0.600 3rd Qu.: 7.20 3rd Qu.:10.30
## Max. :2017-06-25 Brisbane: 3161 Max. :33.90 Max. :48.10 Max. :371.000 Max. :145.00 Max. :14.50
## (Other) :122704 NA's :45482 NA's :52071
## humidity9am humidity3pm pressure9am pressure3pm cloud9am cloud3pm temp9am
## Min. : 0.00 Min. : 0.00 Min. : 980.5 Min. : 977.1 Min. :0.00 Min. :0.00 Min. :-7.20
## 1st Qu.: 57.00 1st Qu.: 37.00 1st Qu.:1013.0 1st Qu.:1010.5 1st Qu.:2.00 1st Qu.:2.00 1st Qu.:12.20
## Median : 70.00 Median : 52.00 Median :1017.6 Median :1015.2 Median :5.00 Median :5.00 Median :16.70
## Mean : 68.92 Mean : 51.59 Mean :1017.6 Mean :1015.2 Mean :4.57 Mean :4.58 Mean :16.96
## 3rd Qu.: 83.00 3rd Qu.: 66.00 3rd Qu.:1022.3 3rd Qu.:1019.9 3rd Qu.:7.00 3rd Qu.:7.00 3rd Qu.:21.50
## Max. :100.00 Max. :100.00 Max. :1041.0 Max. :1039.6 Max. :9.00 Max. :9.00 Max. :40.20
## NA's :11781 NA's :11781 NA's :34034 NA's :34034
## temp3pm raintoday amountOfRain raintomorrow temp humidity precipitation3pm precipitation9am
## Min. :-5.40 No :109332 Min. : 0.000 No :110316 Min. :-3.76 Min. : 2.00 Min. : 0.00 Min. :-17.74
## 1st Qu.:16.60 Yes : 31455 1st Qu.: 0.000 Yes: 31877 1st Qu.:22.52 1st Qu.: 44.00 1st Qu.: 8.00 1st Qu.: 6.65
## Median :21.10 NA's: 1406 Median : 0.000 Median :28.52 Median : 63.20 Median :10.00 Median : 10.00
## Mean :21.72 Mean : 2.361 Mean :28.51 Mean : 62.11 Mean :10.01 Mean : 10.00
## 3rd Qu.:26.50 3rd Qu.: 0.800 3rd Qu.:35.48 3rd Qu.: 80.00 3rd Qu.:12.00 3rd Qu.: 13.39
## Max. :46.70 Max. :371.000 Max. :59.72 Max. :122.00 Max. :26.00 Max. : 32.48
##
## Dia Mes Ano Week
## 1 : 4688 5 :13055 2016 :17508 19 : 2960
## 15 : 4688 3 :13036 2014 :17400 22 : 2957
## 16 : 4687 1 :12921 2015 :17231 20 : 2948
## 2 : 4686 6 :12389 2009 :16595 23 : 2946
## 13 : 4686 10 :11804 2010 :16419 10 : 2945
## 6 : 4684 7 :11779 2013 :16097 25 : 2944
## (Other):114074 (Other):67209 (Other):40943 (Other):124493
Para resolver isso será utilizado a média mensal de um grupo de localidades, construido a partir de uma análise de clusters. Como utilizarei os dados de ventos também para a construção desses clusters, irei tratá-los primeiro.
Tratativa dos dados de vento para realizar um join com a tabela principal com intuito de obter mais informações para tratar os NA restantes.
summary(wind_data)
## date location wind_gustdir wind_gustspeed wind_dir9am wind_dir3pm wind_speed9am
## Min. :2007-11-01 Canberra: 3877 W : 11451 Min. : 6.00 N :13294 SE : 12019 Min. : 0.00
## 1st Qu.:2011-06-07 Sydney : 3798 SE : 10654 1st Qu.: 31.00 SE :10503 W : 11523 1st Qu.: 7.00
## Median :2014-01-13 Perth : 3655 N : 10513 Median : 39.00 E :10397 S : 10971 Median : 13.00
## Mean :2013-09-25 Darwin : 3653 E : 10446 Mean : 39.86 SSE :10289 WSW : 10729 Mean : 13.96
## 3rd Qu.:2016-04-24 Hobart : 3650 WSW : 10320 3rd Qu.: 48.00 NW :10039 SW : 10630 3rd Qu.: 19.00
## Max. :2017-06-25 Brisbane: 3618 (Other):100424 Max. :135.00 (Other):98436 (Other):103679 Max. :130.00
## (Other) :142135 NA's : 10578 NA's :10518 NA's :11428 NA's : 4835 NA's :1453
## wind_speed3pm
## Min. : 0.00
## 1st Qu.:13.00
## Median :19.00
## Mean :18.57
## 3rd Qu.:24.00
## Max. :87.00
## NA's :3583
Existem dias com mais de uma observação para a mesma cidade
dup = wind_data %>% group_by(location,date) %>% count()
table(dup$n)
##
## 1 2
## 120000 22193
As repetições são linhas duplicadas pois a quantidade de linhas duplicadas e do conjunto location/date que se repetem é a mesma
sum(duplicated(wind_data))
## [1] 22193
Removendo as linhas duplicadas
wind_data = wind_data[!duplicated(wind_data),]
Para os campos onde a velocidade do vento é zero, irei substituir a direção que possui NA por X
wind_data = wind_data %>% mutate_if(is.factor, as.character)
wind_data$wind_gustdir[wind_data$wind_gustspeed==0] = "X"
wind_data$wind_dir9am[wind_data$wind_speed9am==0] = "X"
wind_data$wind_dir3pm[wind_data$wind_speed3pm==0] = "X"
wind_data = wind_data %>% mutate_if(is.character, as.factor)
summary(wind_data)
## date location wind_gustdir wind_gustspeed wind_dir9am wind_dir3pm wind_speed9am
## Min. :2007-11-01 Canberra: 3418 W : 9780 Min. : 6.00 N :11393 SE :10663 Min. : 0
## 1st Qu.:2011-01-06 Sydney : 3337 SE : 9309 1st Qu.: 31.00 SE : 9162 W : 9911 1st Qu.: 7
## Median :2013-05-27 Perth : 3193 E : 9071 Median : 39.00 E : 9024 S : 9598 Median : 13
## Mean :2013-04-01 Darwin : 3192 N : 9033 Mean : 39.98 SSE : 8966 WSW : 9329 Mean : 14
## 3rd Qu.:2015-06-12 Hobart : 3188 SSE : 8993 3rd Qu.: 48.00 X : 8612 SW : 9182 3rd Qu.: 19
## Max. :2017-06-25 Brisbane: 3161 (Other):86677 Max. :135.00 (Other):93635 (Other):90828 Max. :130
## (Other) :122704 NA's : 9330 NA's :9270 NA's : 1401 NA's : 2682 NA's :1348
## wind_speed3pm
## Min. : 0.00
## 1st Qu.:13.00
## Median :19.00
## Mean :18.64
## 3rd Qu.:24.00
## Max. :87.00
## NA's :2630
sum(is.na(wind_data))
## [1] 26661
Existem 2 cidades que não possuem nenhuma informação sobre wind_gustdir, logo não será possível utilizar a maior direção com maior frequência naquela região como substituição para esses casos.
wind_data$location = as.factor(wind_data$location)
wind_data$wind_gustdir = as.factor(wind_data$wind_gustdir)
wind_data$wind_dir9am = as.factor(wind_data$wind_dir9am)
wind_data$wind_dir3pm = as.factor(wind_data$wind_dir3pm)
winddir_location_mes = wind_data %>% group_by(location,Mes = month(date)) %>% summarise_at(vars(contains("dir")),funs(getmode))
summary(winddir_location_mes)
## location Mes wind_gustdir wind_dir9am wind_dir3pm
## Adelaide : 12 Min. : 1.00 E : 70 N : 79 W : 59
## Albany : 12 1st Qu.: 3.75 W : 68 X : 69 WSW : 52
## Albury : 12 Median : 6.50 N : 53 E : 62 N : 48
## AliceSprings : 12 Mean : 6.50 SE : 52 SE : 45 SE : 48
## BadgerysCreek: 12 3rd Qu.: 9.25 SW : 40 SW : 39 S : 47
## Ballarat : 12 Max. :12.00 (Other):281 W : 34 SW : 42
## (Other) :516 NA's : 24 (Other):260 (Other):292
Será utilizado a direção do vento mais frequente naquela região dentro do dado mês como substituição para os valores faltantes.
names(winddir_location_mes)[-c(1,2)] = paste0("Mode_",names(winddir_location_mes)[-c(1,2)])
wind_data = wind_data %>% mutate(Mes = month(wind_data$date))
wind_data_clean_1 = wind_data %>% left_join(winddir_location_mes, by = c("location","Mes")) %>%
mutate(wind_gustdir = coalesce(wind_gustdir, Mode_wind_gustdir),
wind_dir9am = coalesce(wind_dir9am, Mode_wind_dir9am),
wind_dir3pm = coalesce(wind_dir3pm, Mode_wind_dir3pm)) %>% select(-contains("Mode_"))
Para as velocidades utilizarei o valor mensal para cada localidade como substituição para os valores faltantes.
windspd_gust_location_mes = wind_data_clean_1 %>% group_by(location,Mes,wind_gustdir) %>%
summarise(Media_wind_gustspeed = mean(wind_gustspeed,na.rm = T))
wind_data_clean_2 = wind_data_clean_1 %>% left_join(windspd_gust_location_mes, by = c("location","Mes","wind_gustdir")) %>%
mutate(wind_gustspeed = coalesce(wind_gustspeed, Media_wind_gustspeed)) %>%
select(-Media_wind_gustspeed)
windspd_9am_location_mes = wind_data_clean_1 %>% group_by(location,Mes,wind_dir9am) %>%
summarise(Media_wind_speed9am = mean(wind_speed9am,na.rm = T))
wind_data_clean_3 = wind_data_clean_2 %>% left_join(windspd_9am_location_mes, by = c("location","Mes","wind_dir9am")) %>%
mutate(wind_speed9am = coalesce(wind_speed9am, Media_wind_speed9am)) %>%
select(-Media_wind_speed9am)
windspd_3pm_location_mes = wind_data_clean_1 %>% group_by(location,Mes,wind_dir3pm) %>%
summarise(Media_wind_speed3pm = mean(wind_speed3pm,na.rm = T))
wind_data_clean_4 = wind_data_clean_3 %>% left_join(windspd_3pm_location_mes, by = c("location","Mes","wind_dir3pm")) %>%
mutate(wind_speed3pm = coalesce(wind_speed3pm, Media_wind_speed3pm)) %>%
select(-Media_wind_speed3pm)
Todos os valores com exceção dos sobre o vento mais forte foram preenchidos dos dados sobre ventos.
summary(wind_data_clean_4)
## date location wind_gustdir wind_gustspeed wind_dir9am wind_dir3pm wind_speed9am
## Min. :2007-11-01 Canberra: 3418 W :10688 Min. : 6.00 N :11475 SE :11365 Min. : 0
## 1st Qu.:2011-01-06 Sydney : 3337 SE : 9452 1st Qu.: 31.00 SE : 9170 W :10055 1st Qu.: 7
## Median :2013-05-27 Perth : 3193 E : 9349 Median : 39.00 E : 9097 S : 9677 Median : 13
## Mean :2013-04-01 Darwin : 3192 SSE : 9349 Mean : 40.05 SSE : 9026 WSW : 9559 Mean : 14
## 3rd Qu.:2015-06-12 Hobart : 3188 N : 9116 3rd Qu.: 48.00 X : 8902 SW : 9231 3rd Qu.: 19
## Max. :2017-06-25 Brisbane: 3161 (Other):88268 Max. :135.00 NW : 8680 SSE : 9156 Max. :130
## (Other) :122704 NA's : 5971 NA's :5971 (Other):85843 (Other):83150
## wind_speed3pm Mes
## Min. : 0.00 Min. : 1.000
## 1st Qu.:13.00 1st Qu.: 3.000
## Median :19.00 Median : 6.000
## Mean :18.62 Mean : 6.403
## 3rd Qu.:24.00 3rd Qu.: 9.000
## Max. :87.00 Max. :12.000
##
Realizando join da tabela principal com a de vento e verificando que as colunas que ainda possuem NA são: evaporation, sunshine, pressure9am, pressure3pm, cloud9am, cloud3pm, wind_gustdir e wind_gustspeed
wind_data_clean_4 = wind_data_clean_4 %>% select(-Mes)
rain_wind_data = rain_data_clean_1 %>% left_join(wind_data_clean_4, by = c("location","date"))
summary(rain_wind_data)
## date location mintemp maxtemp rainfall evaporation sunshine
## Min. :2007-11-01 Canberra: 3418 Min. :-8.50 Min. :-4.80 Min. : 0.000 Min. : 0.00 Min. : 0.00
## 1st Qu.:2011-01-06 Sydney : 3337 1st Qu.: 7.60 1st Qu.:17.90 1st Qu.: 0.000 1st Qu.: 2.60 1st Qu.: 5.40
## Median :2013-05-27 Perth : 3193 Median :12.00 Median :22.60 Median : 0.000 Median : 4.60 Median : 8.20
## Mean :2013-04-01 Darwin : 3192 Mean :12.19 Mean :23.23 Mean : 2.327 Mean : 5.31 Mean : 7.64
## 3rd Qu.:2015-06-12 Hobart : 3188 3rd Qu.:16.80 3rd Qu.:28.20 3rd Qu.: 0.600 3rd Qu.: 7.20 3rd Qu.:10.30
## Max. :2017-06-25 Brisbane: 3161 Max. :33.90 Max. :48.10 Max. :371.000 Max. :145.00 Max. :14.50
## (Other) :122704 NA's :45482 NA's :52071
## humidity9am humidity3pm pressure9am pressure3pm cloud9am cloud3pm temp9am
## Min. : 0.00 Min. : 0.00 Min. : 980.5 Min. : 977.1 Min. :0.00 Min. :0.00 Min. :-7.20
## 1st Qu.: 57.00 1st Qu.: 37.00 1st Qu.:1013.0 1st Qu.:1010.5 1st Qu.:2.00 1st Qu.:2.00 1st Qu.:12.20
## Median : 70.00 Median : 52.00 Median :1017.6 Median :1015.2 Median :5.00 Median :5.00 Median :16.70
## Mean : 68.92 Mean : 51.59 Mean :1017.6 Mean :1015.2 Mean :4.57 Mean :4.58 Mean :16.96
## 3rd Qu.: 83.00 3rd Qu.: 66.00 3rd Qu.:1022.3 3rd Qu.:1019.9 3rd Qu.:7.00 3rd Qu.:7.00 3rd Qu.:21.50
## Max. :100.00 Max. :100.00 Max. :1041.0 Max. :1039.6 Max. :9.00 Max. :9.00 Max. :40.20
## NA's :11781 NA's :11781 NA's :34034 NA's :34034
## temp3pm raintoday amountOfRain raintomorrow temp humidity precipitation3pm precipitation9am
## Min. :-5.40 No :109332 Min. : 0.000 No :110316 Min. :-3.76 Min. : 2.00 Min. : 0.00 Min. :-17.74
## 1st Qu.:16.60 Yes : 31455 1st Qu.: 0.000 Yes: 31877 1st Qu.:22.52 1st Qu.: 44.00 1st Qu.: 8.00 1st Qu.: 6.65
## Median :21.10 NA's: 1406 Median : 0.000 Median :28.52 Median : 63.20 Median :10.00 Median : 10.00
## Mean :21.72 Mean : 2.361 Mean :28.51 Mean : 62.11 Mean :10.01 Mean : 10.00
## 3rd Qu.:26.50 3rd Qu.: 0.800 3rd Qu.:35.48 3rd Qu.: 80.00 3rd Qu.:12.00 3rd Qu.: 13.39
## Max. :46.70 Max. :371.000 Max. :59.72 Max. :122.00 Max. :26.00 Max. : 32.48
##
## Dia Mes Ano Week wind_gustdir wind_gustspeed wind_dir9am
## 1 : 4688 5 :13055 2016 :17508 19 : 2960 W :10688 Min. : 6.00 N :11475
## 15 : 4688 3 :13036 2014 :17400 22 : 2957 SE : 9452 1st Qu.: 31.00 SE : 9170
## 16 : 4687 1 :12921 2015 :17231 20 : 2948 E : 9349 Median : 39.00 E : 9097
## 2 : 4686 6 :12389 2009 :16595 23 : 2946 SSE : 9349 Mean : 40.05 SSE : 9026
## 13 : 4686 10 :11804 2010 :16419 10 : 2945 N : 9116 3rd Qu.: 48.00 X : 8902
## 6 : 4684 7 :11779 2013 :16097 25 : 2944 (Other):88268 Max. :135.00 NW : 8680
## (Other):114074 (Other):67209 (Other):40943 (Other):124493 NA's : 5971 NA's :5971 (Other):85843
## wind_dir3pm wind_speed9am wind_speed3pm
## SE :11365 Min. : 0 Min. : 0.00
## W :10055 1st Qu.: 7 1st Qu.:13.00
## S : 9677 Median : 13 Median :19.00
## WSW : 9559 Mean : 14 Mean :18.62
## SW : 9231 3rd Qu.: 19 3rd Qu.:24.00
## SSE : 9156 Max. :130 Max. :87.00
## (Other):83150
Antes de utilizar a clusterização para preencher os dados faltantes, irei realizar uma clusterização utilizado somente os dados de coordenadas (externos) para a criação de uma noção de proximidade física das localidades. Utilizando o método de cotovelo, foi selecionado o número 4 de clusters.
data = coords[,-1]
elbow(data,20)
## NULL
mod_kmeans = kmeans(data, 4, nstart=50,iter.max = 15 )
rownames(data) <- paste0(coords$city)
fviz_cluster(mod_kmeans, data = data)
coords$Grupo_Prox = as.factor(mod_kmeans$cluster)
rain_wind_data_coords = rain_wind_data %>% left_join(coords, by=c("location"="city"))
pal <- colorFactor(c("blue","yellow","green","red","purple","black"), domain = levels(coords$Grupo_Prox))
leaflet(coords) %>% addProviderTiles(providers$OpenTopoMap) %>% addCircleMarkers(lng = ~lng, lat = ~lat, color = ~pal(Grupo_Prox), stroke = FALSE,fillOpacity = .8, popup = ~city) %>% addLegend("topright", pal = pal, values = ~Grupo_Prox,
title = "Grupo Proximidade",
labFormat = labelFormat(prefix = ""),
opacity = 1
)
Agora será realizado uma clusterização com as colunas que não possuem NA, para encontrar regiões com comportamento parecido na esperança de utilizar a média mensal desses clusters para preencher os NA restantes, sob a hipótese de que esses clusters possam agregar da melhor forma possível regiões semelhantes. Esses clusters criados não serão utilizados para propósito de modelagem para evitar redundância.
cols_sem_na = c("location","mintemp", "maxtemp", "rainfall", "humidity9am", "humidity3pm", "temp9am", "temp3pm",
"amountOfRain", "temp", "humidity", "precipitation3pm", "precipitation9am", "wind_dir9am", "wind_dir3pm",
"wind_speed9am", "wind_speed3pm","lat","lng")
df_clust = rain_wind_data_coords[,cols_sem_na]
df_clust$wind_dir3pm = as.character(df_clust$wind_dir3pm)
df_clust$wind_dir9am = as.character(df_clust$wind_dir9am)
df_clust = df_clust %>% group_by(location) %>%
summarise_all(funs(if(is.numeric(.)) mean(., na.rm = TRUE) else getmode(.)))
df_clust$wind_dir3pm = as.factor(df_clust$wind_dir3pm)
df_clust$wind_dir9am = as.factor(df_clust$wind_dir9am)
df_clust_prep = df_clust %>% mutate_if(is.numeric,scale)
data = model.matrix(location~.,df_clust_prep)[,-1]
elbow(data,20)
## NULL
#Escolhido 5 clusters
mod_kmeans = kmeans(data, 5, nstart=50,iter.max = 15 )
rownames(data) <- paste0(df_clust$location)
fviz_cluster(mod_kmeans, data = data)
df_clust$Cluster = as.factor(mod_kmeans$cluster)
pal <- colorFactor(c("blue","yellow","green","red","purple","black"), domain = levels(df_clust$Cluster))
leaflet(df_clust) %>% addProviderTiles(providers$OpenTopoMap) %>% addCircleMarkers(lng = ~lng, lat = ~lat, color = ~pal(Cluster), stroke = FALSE,fillOpacity = .8, popup = ~location) %>% addLegend("topright", pal = pal, values = ~Cluster,
title = "Cluster",
labFormat = labelFormat(prefix = ""),
opacity = 1
)
rain_wind_data_cl = rain_wind_data_coords %>% left_join(df_clust %>% select(location,Cluster),by="location")
Agora o agrupamento será realizado a partir dos clusters encontrados ao invés da localização e a partir do mês. Criação de uma variável com a variação de temperatura do dia.
clust_loc = rain_wind_data_cl %>% group_by(Cluster,Mes) %>%
summarise(Var_evaporation = mean(evaporation, na.rm = T),
Var_sunshine = mean(sunshine, na.rm = T),
Var_pressure9am = mean(pressure9am, na.rm = T),
Var_pressure3pm = mean(pressure3pm, na.rm = T),
Var_cloud9am = mean(cloud9am, na.rm = T),
Var_cloud3pm = mean(cloud3pm, na.rm = T),
Var_wind_gustdir = getmode(wind_gustdir),
Var_wind_gustspeed = mean(wind_gustspeed, na.rm = T),
Var_raintoday = getmode(raintoday))
rain_wind_data_final = rain_wind_data_cl %>% left_join(clust_loc, by = c("Cluster","Mes")) %>%
mutate(evaporation = coalesce(evaporation,Var_evaporation),
sunshine = coalesce(sunshine,Var_sunshine),
pressure9am = coalesce(pressure9am,Var_pressure9am),
pressure3pm = coalesce(pressure3pm,Var_pressure3pm),
cloud9am = coalesce(cloud9am,Var_cloud9am),
cloud3pm = coalesce(cloud3pm,Var_cloud3pm),
wind_gustdir =coalesce(wind_gustdir,Var_wind_gustdir),
wind_gustspeed =
coalesce(wind_gustspeed,Var_wind_gustspeed),
raintoday = coalesce(raintoday,Var_raintoday)) %>%
select(-contains("Var_"),-Cluster)
rain_wind_data_final = rain_wind_data_final %>% mutate(Var_Temp = maxtemp - mintemp)
Preenchendo assim todos os dados faltantes que restavam.
summary(rain_wind_data_final)
## date location mintemp maxtemp rainfall evaporation sunshine
## Min. :2007-11-01 Length:142193 Min. :-8.50 Min. :-4.80 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.:2011-01-06 Class :character 1st Qu.: 7.60 1st Qu.:17.90 1st Qu.: 0.000 1st Qu.: 2.630 1st Qu.: 5.800
## Median :2013-05-27 Mode :character Median :12.00 Median :22.60 Median : 0.000 Median : 4.620 Median : 7.782
## Mean :2013-04-01 Mean :12.19 Mean :23.23 Mean : 2.327 Mean : 5.198 Mean : 7.528
## 3rd Qu.:2015-06-12 3rd Qu.:16.80 3rd Qu.:28.20 3rd Qu.: 0.600 3rd Qu.: 7.000 3rd Qu.: 9.500
## Max. :2017-06-25 Max. :33.90 Max. :48.10 Max. :371.000 Max. :145.000 Max. :14.500
##
## humidity9am humidity3pm pressure9am pressure3pm cloud9am cloud3pm temp9am
## Min. : 0.00 Min. : 0.00 Min. : 980.5 Min. : 977.1 Min. :0.000 Min. :0.000 Min. :-7.20
## 1st Qu.: 57.00 1st Qu.: 37.00 1st Qu.:1013.3 1st Qu.:1010.8 1st Qu.:3.000 1st Qu.:3.163 1st Qu.:12.20
## Median : 70.00 Median : 52.00 Median :1017.7 Median :1015.4 Median :5.000 Median :4.951 Median :16.70
## Mean : 68.92 Mean : 51.59 Mean :1017.7 Mean :1015.3 Mean :4.658 Mean :4.649 Mean :16.96
## 3rd Qu.: 83.00 3rd Qu.: 66.00 3rd Qu.:1022.1 3rd Qu.:1019.7 3rd Qu.:6.547 3rd Qu.:6.000 3rd Qu.:21.50
## Max. :100.00 Max. :100.00 Max. :1041.0 Max. :1039.6 Max. :9.000 Max. :9.000 Max. :40.20
##
## temp3pm raintoday amountOfRain raintomorrow temp humidity precipitation3pm precipitation9am
## Min. :-5.40 No :110734 Min. : 0.000 No :110316 Min. :-3.76 Min. : 2.00 Min. : 0.00 Min. :-17.74
## 1st Qu.:16.60 Yes: 31459 1st Qu.: 0.000 Yes: 31877 1st Qu.:22.52 1st Qu.: 44.00 1st Qu.: 8.00 1st Qu.: 6.65
## Median :21.10 Median : 0.000 Median :28.52 Median : 63.20 Median :10.00 Median : 10.00
## Mean :21.72 Mean : 2.361 Mean :28.51 Mean : 62.11 Mean :10.01 Mean : 10.00
## 3rd Qu.:26.50 3rd Qu.: 0.800 3rd Qu.:35.48 3rd Qu.: 80.00 3rd Qu.:12.00 3rd Qu.: 13.39
## Max. :46.70 Max. :371.000 Max. :59.72 Max. :122.00 Max. :26.00 Max. : 32.48
##
## Dia Mes Ano Week wind_gustdir wind_gustspeed wind_dir9am
## 1 : 4688 5 :13055 2016 :17508 19 : 2960 W :13153 Min. : 6.00 N :11475
## 15 : 4688 3 :13036 2014 :17400 22 : 2957 N :10387 1st Qu.: 31.00 SE : 9170
## 16 : 4687 1 :12921 2015 :17231 20 : 2948 SSE :10314 Median : 39.00 E : 9097
## 2 : 4686 6 :12389 2009 :16595 23 : 2946 SE : 9944 Mean : 40.02 SSE : 9026
## 13 : 4686 10 :11804 2010 :16419 10 : 2945 E : 9349 3rd Qu.: 46.53 X : 8902
## 6 : 4684 7 :11779 2013 :16097 25 : 2944 ENE : 9051 Max. :135.00 NW : 8680
## (Other):114074 (Other):67209 (Other):40943 (Other):124493 (Other):79995 (Other):85843
## wind_dir3pm wind_speed9am wind_speed3pm lat lng Grupo_Prox Var_Temp
## SE :11365 Min. : 0 Min. : 0.00 Min. :-42.85 Min. :115.1 1:20706 Min. :-1.895
## W :10055 1st Qu.: 7 1st Qu.:13.00 1st Qu.:-36.06 1st Qu.:138.6 2:55323 1st Qu.: 7.200
## S : 9677 Median : 13 Median :19.00 Median :-33.94 Median :145.8 3:15324 Median :10.500
## WSW : 9559 Mean : 14 Mean :18.62 Mean :-32.75 Mean :142.1 4:50840 Mean :11.039
## SW : 9231 3rd Qu.: 19 3rd Qu.:24.00 3rd Qu.:-31.47 3rd Qu.:150.7 3rd Qu.:14.500
## SSE : 9156 Max. :130 Max. :87.00 Max. :-12.43 Max. :168.0 Max. :31.200
## (Other):83150
Divisão dos dados em treino (80%), teste (10%) e validação(10%)
rain_wind_data_final$raintoday = ifelse(rain_wind_data_final$raintoday == "Yes",1,0)
set.seed(1234)
#part = createDataPartition(rain_wind_data_final$raintomorrow,p=.8,list = F)
rain_wind_data_final$raintomorrow = as.factor(ifelse(rain_wind_data_final$raintomorrow=="Yes",1,0))
rain_wind_data_final$location = as.factor(rain_wind_data_final$location)
train = rain_wind_data_final[part,]
test = rain_wind_data_final[-part,]
set.seed(1234)
#part_val = createDataPartition(test$raintomorrow,p=.5,list = F)
validation = test[part_val,]
test = test[-part_val,]
trainctrl <- trainControl(verboseIter = TRUE,method="cv", number=3,allowParallel = TRUE)
Criação da variável Choveu_Grupo, que verifica se em alguma das localidades do grupo de proximidade houve relato de chuva naquele dia. Criação dessa variável nos dados de treino, para evitar overfitting.
chuva_prox = train %>% group_by(Grupo_Prox,date) %>%
summarise(Choveu_Grupo = as.factor(max(raintoday,na.rm = T)))
train$raintoday = as.factor(train$raintoday)
train = train %>% left_join(chuva_prox)
## Joining, by = c("date", "Grupo_Prox")
train$Choveu_Grupo = as.factor(train$Choveu_Grupo)
prop.table(table(Chove_Amanha = train$raintomorrow,Choveu_Proximo = train$Choveu_Grupo),margin=1)
## Choveu_Proximo
## Chove_Amanha 0 1
## 0 0.4177422 0.5822578
## 1 0.1720257 0.8279743
prep = preProcess(train,c('center','scale'))
train = predict(prep,train)
test = test %>% left_join(chuva_prox)
## Joining, by = c("date", "Grupo_Prox")
test$Choveu_Grupo[is.na(test$Choveu_Grupo)] = 0
test$Choveu_Grupo = as.factor(test$Choveu_Grupo)
test$raintoday = as.factor(test$raintoday)
test = predict(prep,test)
Podemos notar que aproximadamente 80% das vezes que chove no dia seguinte, houve alguma chuva dentro do grupo de proximidade.
Como a variável resposta é muito desbalanceada, utilizarei um método de up sampling para equilibrá-la.
sim = train[train$raintomorrow==1,]
nao = train[!train$raintomorrow==1,]
set.seed(1234)
#samp_sim = sample(nrow(sim),50000,replace = T)
set.seed(1234)
#samp_nao = sample(nrow(nao),50000,replace = F)
train_bal = rbind(sim[samp_sim,],nao[samp_nao,])
Seleção das variáveis que serão utilizadas na composição do modelo inicial.
vars_modelo = c("location", "mintemp", "maxtemp", "rainfall", "evaporation", "sunshine",
"humidity9am", "humidity3pm", "pressure9am", "pressure3pm", "cloud9am", "cloud3pm", "temp9am",
"temp3pm", "raintoday", "temp", "humidity", "precipitation3pm", "precipitation9am", "Mes", "wind_gustdir", "wind_gustspeed", "wind_dir9am", "wind_dir3pm", "wind_speed9am", "wind_speed3pm", "Grupo_Prox", "Var_Temp", "Choveu_Grupo", "raintomorrow")
train_bal = train_bal[,vars_modelo]
train = train[,vars_modelo]
#model_rpart = train(raintomorrow~.,train_bal,"rpart",tuneLength = 3,trControl = trainctrl)
pred_rpart = predict(model_rpart,test)
confusionMatrix(pred_rpart,test$raintomorrow)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 8190 821
## 1 2841 2366
##
## Accuracy : 0.7424
## 95% CI : (0.7352, 0.7496)
## No Information Rate : 0.7758
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3957
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7425
## Specificity : 0.7424
## Pos Pred Value : 0.9089
## Neg Pred Value : 0.4544
## Prevalence : 0.7758
## Detection Rate : 0.5760
## Detection Prevalence : 0.6338
## Balanced Accuracy : 0.7424
##
## 'Positive' Class : 0
##
aux_rpart = rocp(model_rpart,test,test$raintomorrow)
## NULL
acc_rpart = confusionMatrix(pred_rpart,test$raintomorrow)$overall[1]
plot_density_pred(model_rpart,test)
teste = predict(model_rpart,test,"prob")[,2]
teste2 = as.factor(ifelse(teste>.3,1,0))
confusionMatrix(teste2,test$raintomorrow)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6162 527
## 1 4869 2660
##
## Accuracy : 0.6205
## 95% CI : (0.6124, 0.6285)
## No Information Rate : 0.7758
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2649
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5586
## Specificity : 0.8346
## Pos Pred Value : 0.9212
## Neg Pred Value : 0.3533
## Prevalence : 0.7758
## Detection Rate : 0.4334
## Detection Prevalence : 0.4705
## Balanced Accuracy : 0.6966
##
## 'Positive' Class : 0
##
#model_logistic = train(raintomorrow~.,train_bal,"glm",family="binomial")
pred_logistic = predict(model_logistic,test)
confusionMatrix(pred_logistic,test$raintomorrow)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 8834 702
## 1 2197 2485
##
## Accuracy : 0.7961
## 95% CI : (0.7894, 0.8027)
## No Information Rate : 0.7758
## P-Value [Acc > NIR] : 2.539e-09
##
## Kappa : 0.4976
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8008
## Specificity : 0.7797
## Pos Pred Value : 0.9264
## Neg Pred Value : 0.5308
## Prevalence : 0.7758
## Detection Rate : 0.6213
## Detection Prevalence : 0.6707
## Balanced Accuracy : 0.7903
##
## 'Positive' Class : 0
##
aux_logistic = rocp(model_logistic,test,test$raintomorrow)
## NULL
acc_logistic = confusionMatrix(pred_logistic,test$raintomorrow)$overall[1]
plot_density_pred(model_logistic,test)
#model_xgb = train(raintomorrow~.,train_bal,"xgbTree",tuneLength = 3,trControl = trainctrl)
pred_xgb = predict(model_xgb,test)
confusionMatrix(pred_xgb,test$raintomorrow)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 9153 753
## 1 1878 2434
##
## Accuracy : 0.815
## 95% CI : (0.8085, 0.8213)
## No Information Rate : 0.7758
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5273
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8298
## Specificity : 0.7637
## Pos Pred Value : 0.9240
## Neg Pred Value : 0.5645
## Prevalence : 0.7758
## Detection Rate : 0.6438
## Detection Prevalence : 0.6967
## Balanced Accuracy : 0.7967
##
## 'Positive' Class : 0
##
aux_xgb = rocp(model_xgb,test,test$raintomorrow)
## NULL
acc_xgb = confusionMatrix(pred_xgb,test$raintomorrow)$overall[1]
plot_density_pred(model_xgb,test)
#model_ctree = train(raintomorrow~.,train_bal,"ctree",tuneLength = 3,trControl = trainctrl)
pred_ctree = predict(model_ctree,test)
confusionMatrix(pred_ctree,test$raintomorrow)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 8526 880
## 1 2505 2307
##
## Accuracy : 0.7619
## 95% CI : (0.7548, 0.7689)
## No Information Rate : 0.7758
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.4206
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7729
## Specificity : 0.7239
## Pos Pred Value : 0.9064
## Neg Pred Value : 0.4794
## Prevalence : 0.7758
## Detection Rate : 0.5997
## Detection Prevalence : 0.6616
## Balanced Accuracy : 0.7484
##
## 'Positive' Class : 0
##
auc_ctree = rocp(model_ctree,test,test$raintomorrow)
## NULL
acc_ctree = confusionMatrix(pred_ctree,test$raintomorrow)$overall[1]
plot_density_pred(model_ctree,test)
O XGBoost possui um hiperparâmetro específico para lidar com amostras desbalanceadas chamado scale_pos_weight, o valor recomendado para elé número de amostras negativas/número de amostras positivas.
#mod_xgb_weighted = train(raintomorrow~. ,train,"xgbTree", tuneLength = 3,trControl = trainctrl, scale_pos_weight = 3.46)
pred_xgb_weighted = predict(mod_xgb_weighted,test)
confusionMatrix(pred_xgb_weighted,test$raintomorrow)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 10898 2179
## 1 133 1008
##
## Accuracy : 0.8374
## 95% CI : (0.8312, 0.8434)
## No Information Rate : 0.7758
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3942
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9879
## Specificity : 0.3163
## Pos Pred Value : 0.8334
## Neg Pred Value : 0.8834
## Prevalence : 0.7758
## Detection Rate : 0.7665
## Detection Prevalence : 0.9197
## Balanced Accuracy : 0.6521
##
## 'Positive' Class : 0
##
auc_xgb_weighted=rocp(mod_xgb_weighted,test,test$raintomorrow)
## NULL
acc_xgb_weighted = confusionMatrix(pred_xgb_weighted,test$raintomorrow)$overall[1]
plot_density_pred(mod_xgb_weighted,test)
#mod_rf = train(raintomorrow~. ,train_bal,"rf", tuneLength = 3,trControl = trainctrl)
pred_rf = predict(mod_rf,test)
confusionMatrix(pred_rf,test$raintomorrow)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 9841 1050
## 1 1190 2137
##
## Accuracy : 0.8425
## 95% CI : (0.8364, 0.8484)
## No Information Rate : 0.7758
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.554
##
## Mcnemar's Test P-Value : 0.003315
##
## Sensitivity : 0.8921
## Specificity : 0.6705
## Pos Pred Value : 0.9036
## Neg Pred Value : 0.6423
## Prevalence : 0.7758
## Detection Rate : 0.6922
## Detection Prevalence : 0.7660
## Balanced Accuracy : 0.7813
##
## 'Positive' Class : 0
##
auc_rf = rocp(mod_rf,test,test$raintomorrow)
## NULL
acc_rf = confusionMatrix(pred_rf,test$raintomorrow)$overall[1]
plot_density_pred(mod_rf,test)
Na tentativa de melhorar a performance do modelo do XGBoost, será realizado um tuning dos seus hiperparâmetros.
nrounds <- 1000
tune_grid <- expand.grid(
nrounds = seq(from = 200, to = nrounds, by = 50),
eta = c(0.025, 0.05, 0.1, 0.3),
max_depth = c(2, 3, 4, 5, 6),
gamma = 0,
colsample_bytree = 1,
min_child_weight = 1,
subsample = 1
)
trainctrl <- trainControl(verboseIter = TRUE,method="cv", number=3)
#tune_xgb1 = train(raintomorrow~.,train_bal,"xgbTree",trControl = trainctrl,tuneGrid = tune_grid)
ggplot(tune_xgb1) + theme_bw() + ylim(c(min(tune_xgb1$results$Accuracy),1))
maxd = tune_xgb1$bestTune$max_depth
tune_grid2 <- expand.grid(
nrounds = seq(from = 50, to = nrounds, by = 50),
eta = tune_xgb1$bestTune$eta,
max_depth = c((maxd-1):(maxd+1)),
gamma = 0,
colsample_bytree = 1,
min_child_weight = c(1, 2, 3),
subsample = 1
)
#tune_xgb2 = train(raintomorrow~.,train_bal,"xgbTree",trControl = trainctrl,tuneGrid = tune_grid2)
ggplot(tune_xgb2) + theme_bw() + ylim(c(min(tune_xgb2$results$Accuracy),1))
tune_grid3 <- expand.grid(
nrounds = seq(from = 50, to = nrounds, by = 50),
eta = tune_xgb1$bestTune$eta,
max_depth = 6,
gamma = 0,
colsample_bytree = c(0.4, 0.6, 0.8, 1.0),
min_child_weight = tune_xgb2$bestTune$min_child_weight,
subsample = c(0.5, 0.75, 1.0)
)
#tune_xgb3 = train(raintomorrow~.,train_bal,"xgbTree",trControl = trainctrl,tuneGrid = tune_grid3)
ggplot(tune_xgb3) + theme_bw() + ylim(c(min(tune_xgb3$results$Accuracy),1))
tune_grid4 <- expand.grid(
nrounds = seq(from = 50, to = nrounds, by = 50),
eta = tune_xgb1$bestTune$eta,
max_depth = 6,
gamma = c(0, 0.05, 0.1, 0.5, 0.7, 0.9, 1.0),
colsample_bytree = tune_xgb3$bestTune$colsample_bytree,
min_child_weight = tune_xgb2$bestTune$min_child_weight,
subsample = tune_xgb3$bestTune$subsample
)
#tune_xgb4 = train(raintomorrow~.,train_bal,"xgbTree",trControl = trainctrl,tuneGrid = tune_grid4)
ggplot(tune_xgb4) + theme_bw() + ylim(c(min(tune_xgb4$results$Accuracy),1))
## Warning: The shape palette can deal with a maximum of 6 discrete values because more than 6 becomes difficult to
## discriminate; you have 7. Consider specifying shapes manually if you must have them.
## Warning: Removed 20 rows containing missing values (geom_point).
tune_grid5 <- expand.grid(
nrounds = seq(from = 100, to = 10000, by = 100),
eta = c(0.01, 0.015, 0.025, 0.05, 0.1),
max_depth = 6,
gamma = tune_xgb4$bestTune$gamma,
colsample_bytree = tune_xgb3$bestTune$colsample_bytree,
min_child_weight = tune_xgb2$bestTune$min_child_weight,
subsample = tune_xgb3$bestTune$subsample
)
#tune_xgb5 = train(raintomorrow~.,train_bal,"xgbTree",trControl = trainctrl,tuneGrid = tune_grid5)
ggplot(tune_xgb5) + theme_bw() + ylim(c(min(tune_xgb5$results$Accuracy),1))
final_grid <- expand.grid(
nrounds = 2800,
eta = 0.1,
max_depth = 6,
gamma = tune_xgb5$bestTune$gamma,
colsample_bytree = tune_xgb5$bestTune$colsample_bytree,
min_child_weight = tune_xgb5$bestTune$min_child_weight,
subsample = tune_xgb5$bestTune$subsample)
#mod_xgb_tuned = train(raintomorrow~.,train_bal,"xgbTree",trControl = trainctrl,tuneGrid = final_grid)
pred_xgb_tuned = predict(mod_xgb_tuned,test)
confusionMatrix(pred_xgb_tuned,test$raintomorrow)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 9700 963
## 1 1331 2224
##
## Accuracy : 0.8387
## 95% CI : (0.8325, 0.8447)
## No Information Rate : 0.7758
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5544
##
## Mcnemar's Test P-Value : 1.824e-14
##
## Sensitivity : 0.8793
## Specificity : 0.6978
## Pos Pred Value : 0.9097
## Neg Pred Value : 0.6256
## Prevalence : 0.7758
## Detection Rate : 0.6822
## Detection Prevalence : 0.7500
## Balanced Accuracy : 0.7886
##
## 'Positive' Class : 0
##
auc_xgb_tuned = rocp(mod_xgb_tuned,test,test$raintomorrow)
## NULL
acc_xgb_tuned = confusionMatrix(pred_xgb_tuned,test$raintomorrow)$overall[1]
plot_density_pred(mod_xgb_tuned,test)
accs = c(acc_rpart,acc_logistic,acc_xgb,acc_ctree,acc_xgb_weighted,acc_rf, acc_xgb_tuned)
aucs = c(aux_rpart[[1]],aux_logistic[[1]],aux_xgb[[1]],auc_ctree[[1]],auc_xgb_weighted[[1]],auc_rf[[1]],auc_xgb_tuned[[1]])
df = data.frame(Modelo = c("Decision Tree",
"Logistic Regression",
"XGBoost","Conditional Tree",
"XGBoost Unbalanced", "Random Forest", "XGBoost Tuned"), Accuracy=accs,AUC=aucs)
df
## Modelo Accuracy AUC
## 1 Decision Tree 0.7424392 0.7660589
## 2 Logistic Regression 0.7961035 0.8722469
## 3 XGBoost 0.8149529 0.8820449
## 4 Conditional Tree 0.7619215 0.8173365
## 5 XGBoost Unbalanced 0.8373892 0.8827417
## 6 Random Forest 0.8424532 0.8846150
## 7 XGBoost Tuned 0.8386552 0.8847764
preds_list = list(predict(model_rpart,test,"prob")[,2],
predict(model_logistic,test,"prob")[,2],
predict(model_xgb,test,"prob")[,2],
predict(model_ctree,test,"prob")[,2],
predict(mod_xgb_weighted,test,"prob")[,2],
predict(mod_rf,test,"prob")[,2],
predict(mod_xgb_tuned,test,"prob")[,2])
m <- length(preds_list)
actuals_list <- rep(list(test$raintomorrow), m)
pred <- prediction(preds_list, actuals_list)
rocs <- performance(pred, "tpr", "fpr")
plot(rocs, col = as.list(1:m), main = "Test Set ROC Curves")
legend(x = "bottomright",
legend = c("Decision Tree",
"Logistic Regression",
"XGBoost","Conditional Tree",
"XGBoost Unbalanced", "Random Forest","XGBoost Tuned"),
fill = 1:m)
O modelo escohido foi XGBoost Tuned, o método de seleção foi o AUC. O ponto de corte foi alterado para 0.35, atingindo assim as seguintes métricas para a base de validação.
validation = validation %>% left_join(chuva_prox)
## Joining, by = c("date", "Grupo_Prox")
validation$Choveu_Grupo[is.na(validation$Choveu_Grupo)] = 0
validation$Choveu_Grupo = as.factor(validation$Choveu_Grupo)
validation$raintoday = as.factor(validation$raintoday)
validation = predict(prep,validation)
pred_thresh = predict(mod_xgb_tuned,validation,"prob")[,2]
pred_thresh = as.factor(ifelse(pred_thresh>.35,1,0))
confusionMatrix(pred_thresh,validation$raintomorrow)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 9183 722
## 1 1849 2466
##
## Accuracy : 0.8192
## 95% CI : (0.8128, 0.8255)
## No Information Rate : 0.7758
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5383
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8324
## Specificity : 0.7735
## Pos Pred Value : 0.9271
## Neg Pred Value : 0.5715
## Prevalence : 0.7758
## Detection Rate : 0.6458
## Detection Prevalence : 0.6966
## Balanced Accuracy : 0.8030
##
## 'Positive' Class : 0
##
O modelo escolhido possui um resultado satisfatório, entretanto necessita melhora para o caso dos falsos positivos, talvez a tentativa de algum outro modelo ou a criação de algumas outras variáveis possa criar uma melhora na performance.